In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px

# Load the CSV into `df` and preview the first rows.
# NOTE(review): this is an absolute, machine-specific path, so the notebook
# only runs as-is on the machine it was written on.
DATA_PATH = r'C:\Users\TajwarAbtahee\OneDrive - JCW Resourcing\Desktop\Python\practice\amazon.csv'
df = pd.read_csv(DATA_PATH)
df.head()
Out[1]:
year state month number date
0 1998 Acre Janeiro 0.0 1998-01-01
1 1999 Acre Janeiro 0.0 1999-01-01
2 2000 Acre Janeiro 0.0 2000-01-01
3 2001 Acre Janeiro 0.0 2001-01-01
4 2002 Acre Janeiro 0.0 2002-01-01
In [2]:
# Show column dtypes and non-null counts for the freshly loaded frame.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6454 entries, 0 to 6453
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   year    6454 non-null   int64  
 1   state   6454 non-null   object 
 2   month   6454 non-null   object 
 3   number  6454 non-null   float64
 4   date    6454 non-null   object 
dtypes: float64(1), int64(1), object(3)
memory usage: 252.2+ KB
In [3]:
# Count rows per month label to see which (untranslated) names are present.
df['month'].value_counts()
Out[3]:
Janeiro      541
Outubro      540
Julho        540
Novembro     540
Junho        540
Agosto       540
Maio         540
Mar�o        540
Setembro     540
Abril        540
Fevereiro    540
Dezembro     513
Name: month, dtype: int64
In [4]:
# Translate the Portuguese month names in `df['month']` to English.
MONTH_TRANSLATION = {
    'Janeiro': 'January',
    'Fevereiro': 'February',
    'Mar�o': 'March',   # spelling as it appears in the loaded data (cedilla lost)
    'Março': 'March',   # correctly decoded spelling, in case the file is re-read with another encoding
    'Abril': 'April',
    'Maio': 'May',
    'Junho': 'June',
    'Julho': 'July',
    'Agosto': 'August',
    'Setembro': 'September',
    'Outubro': 'October',
    'Novembro': 'November',
    'Dezembro': 'December',
}

# A plain `.map(dict)` turns every value that is not a key into NaN, so running
# this cell a second time (when the column is already English) would wipe the
# column. Falling back to the original value keeps the cell idempotent and
# leaves any unexpected label visible instead of silently dropping it.
df['month'] = df['month'].map(lambda name: MONTH_TRANSLATION.get(name, name))
In [5]:
# Re-count rows per month to confirm every label was translated.
df['month'].value_counts()
Out[5]:
January      541
August       540
March        540
November     540
April        540
June         540
September    540
July         540
May          540
February     540
October      540
December     513
Name: month, dtype: int64
In [6]:
# Convert the `date` column from strings to datetime values.
df['date'] = pd.to_datetime(df['date'])
In [7]:
# Re-check dtypes after the conversion: `date` should now be datetime64[ns].
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6454 entries, 0 to 6453
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   year    6454 non-null   int64         
 1   state   6454 non-null   object        
 2   month   6454 non-null   object        
 3   number  6454 non-null   float64       
 4   date    6454 non-null   datetime64[ns]
dtypes: datetime64[ns](1), float64(1), int64(1), object(2)
memory usage: 252.2+ KB
In [8]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()
Out[8]:
year number
count 6454.000000 6454.000000
mean 2007.461729 108.293163
std 5.746654 190.812242
min 1998.000000 0.000000
25% 2002.000000 3.000000
50% 2007.000000 24.000000
75% 2012.000000 113.000000
max 2017.000000 998.000000
In [9]:
# Sum of `number` per year.
# numeric_only=True is passed explicitly: older pandas silently dropped the
# non-numeric columns here, but pandas >= 2.0 tries to aggregate them too and
# fails on the datetime `date` column. The result keeps the same two columns
# (`year`, `number`) as before.
years = df.groupby('year').sum(numeric_only=True).reset_index()
years.columns
Out[9]:
Index(['year', 'number'], dtype='object')
In [10]:
# Static line chart of the yearly totals, with one x tick per year.
fig, ax = plt.subplots(figsize=(20, 10))
sns.lineplot(data=years, x='year', y='number', marker='o', ax=ax)
ax.set(xticks=years['year'])
plt.show()
In [11]:
# Observation: a decline for six years from 2003, then a steady rise from 2008 onwards.
px.line(data_frame=years, x='year', y='number')
In [12]:
# Sum of `number` per state, largest first.
# numeric_only=True keeps this working on pandas >= 2.0, where a bare .sum()
# no longer drops the string/datetime columns silently.
states = (
    df.groupby('state')
      .sum(numeric_only=True)
      .sort_values('number', ascending=False)
      .reset_index()
)
px.bar(states, x='state', y='number', text_auto='.2s')
# Mato Grosso has almost double the number of cases of each of the other states in the top 5.
In [13]:
# Mean of `number` per calendar month, ordered January -> December.
MONTH_ORDER = ['January', 'February', 'March', 'April', 'May', 'June',
               'July', 'August', 'September', 'October', 'November', 'December']
month_rank = {name: position for position, name in enumerate(MONTH_ORDER)}

# The rows come out of groupby in alphabetical order, so they are sorted by an
# explicit calendar rank instead of a hand-written list of row positions
# (which only works while the alphabetical order of the labels never changes).
# numeric_only=True is required on pandas >= 2.0, where .mean() raises on the
# string columns instead of dropping them.
months = (
    df.groupby('month')
      .mean(numeric_only=True)
      .reset_index()
      .sort_values('month', key=lambda names: names.map(month_rank))
)
px.line(months, x='month', y='number')

# Observation: February to May are the calmer months; the numbers then rise
# sharply through summer and into winter, cooling off after November, with the
# exception of a dip in September.
In [14]:
# Preview the frame again: month names are now in English.
df.head()
Out[14]:
year state month number date
0 1998 Acre January 0.0 1998-01-01
1 1999 Acre January 0.0 1999-01-01
2 2000 Acre January 0.0 2000-01-01
3 2001 Acre January 0.0 2001-01-01
4 2002 Acre January 0.0 2002-01-01
In [15]:
# Sum of `number` for every (year, state) pair.
# numeric_only=True is passed explicitly because pandas >= 2.0 no longer drops
# the non-numeric `month` and `date` columns silently; the result keeps the
# same three columns (`year`, `state`, `number`).
state_year = df.groupby(['year', 'state']).sum(numeric_only=True).reset_index()
state_year
Out[15]:
year state number
0 1998 Acre 730.000
1 1998 Alagoas 86.000
2 1998 Amapa 278.000
3 1998 Amazonas 946.000
4 1998 Bahia 1224.687
... ... ... ...
455 2017 Roraima 1101.000
456 2017 Santa Catarina 2354.000
457 2017 Sao Paulo 2540.868
458 2017 Sergipe 75.000
459 2017 Tocantins 1378.959

460 rows × 3 columns

In [16]:
# One line per state showing its yearly total.
px.line(data_frame=state_year, x='year', y='number', color='state')

# Observations:
# - Mato Grosso, as mentioned previously, has had a higher case rate than all other regions since 1998.
# - States such as Sergipe and Alagoas have a consistently low rate throughout the years.
In [17]:
# Average of `number` per state, largest first.
# numeric_only=True keeps this working on pandas >= 2.0, where .mean() raises
# on the string columns instead of dropping them.
case_ave = (
    df.groupby('state')
      .mean(numeric_only=True)
      .sort_values('number', ascending=False)
      .reset_index()
)
px.bar(case_ave, x='state', y='number', text_auto='.3s')
# Although Mato Grosso has the highest total number of cases, Sao Paulo has the
# highest average per record.
# NOTE(review): the mean is taken over rows of `df`, which look like one row per
# state/month/year, so this is an average per monthly record rather than a
# day-to-day figure - confirm against the data source.